#!/usr/bin/Rscript

# Libraries
require(stringr)
require(igraph)


# Define cut_date.
# NOTE(review): cut_date is not referenced anywhere else in the visible part
# of this script -- confirm whether it is still needed.
cut_date <- as.Date("2014-01-31")

# Load helper functions from a remote script. sqliteGetTable(), used below, is
# not defined in this file and presumably comes from here -- TODO confirm.
# NOTE(review): this downloads and executes unpinned remote code on every run.
source('https://raw.githubusercontent.com/fraba/R_cheatsheet/master/database.R')

# Load tables
db <- "pp_forum_apr2014.sqlite"
author <- sqliteGetTable(db, "author")
thread <- sqliteGetTable(db, "thread")
post <- sqliteGetTable(db, "post")

# Vertex table for the direct-reply graph: one row per author. The first
# column (name) duplicates authorId; it is the column the cleaning step later
# matches relation targets against.
users <- data.frame(name=author$authorId,
                    authorId=author$authorId,
                    authorName=author$authorName)

# Resolve the author a post is addressed to.
#
# A post counts as a direct reply when its message contains a
# "<blockquote><cite>" quote and a "pid=<digits>" reference; every other post
# is treated as a reply to the author of the first post of its thread.
#
# Args:
#   thisId:            id of the post to resolve (looked up in postId).
#   postId:            vector of post ids.
#   postMessage:       vector of message bodies, parallel to postId.
#   authorId:          vector of author ids, parallel to postId.
#   threadId:          vector of thread ids, parallel to postId.
#   postOrdinalNumber: position of each post in its thread, parallel to postId.
#
# Returns:
#   Always exactly one value:
#   * the authorId of the quoted post, when that post is among postId;
#   * the quoted post id as the string "pid<digits>" when it is not;
#   * the authorId of the thread's first post for posts without a quote;
#   * NA when thisId is unknown or the thread has no post with ordinal 1.
#   Returning a single value (never a zero-length or multi-element vector)
#   keeps the result aligned one-to-one with the posts it was computed for.
returnTargetAuthorId <- function (thisId,
                                  postId, postMessage, authorId,
                                  threadId, postOrdinalNumber) {
  pat <- "pid=[0-9]+"

  # First matching row only, so that a duplicated id cannot produce a
  # length > 1 condition in the `if` below.
  row <- match(thisId, postId)
  if (is.na(row)) {
    return(NA)
  }

  msg <- as.character(postMessage[row])

  # Does the message contain a direct reply?
  if (grepl("<blockquote><cite>", msg) && grepl(pat, msg)) {
    # Find the targeted post id: "pid=123" becomes "pid123".
    # NOTE(review): the lookup below only succeeds if postId values use this
    # same "pid<digits>" form -- confirm against the post table.
    pid <- regmatches(msg, regexpr(pat, msg))
    pid <- gsub("=", "", pid)

    # Return the targeted authorId, or the unresolved pid as a marker
    aid <- authorId[which(postId == pid)]
    if (length(aid) != 0) {
      return(aid[1])
    }
    return(pid)
  }

  # Not a direct reply: fall back to the author of the thread's first post
  firstAuthor <- authorId[which(threadId == threadId[row]
                                & postOrdinalNumber == 1)]
  if (length(firstAuthor) == 0) {
    return(NA)
  }
  firstAuthor[1]
}

# Build the edge list one thread at a time: each post becomes one row going
# from its author (from) to the author it is addressed to (to), as resolved by
# returnTargetAuthorId() within the posts of the same thread.
#
# Per-thread frames are collected in a preallocated list and bound once after
# the loop; calling rbind() inside the loop copies the accumulated frame on
# every iteration.
threadIds <- unique(post$threadId)
relationList <- vector("list", length(threadIds))
i <- 1

for (thisThreadId in threadIds) {

  # Subset post df
  subPost <- subset(post, threadId==thisThreadId)

  # An NA thread id matches no rows; skip it instead of building a frame
  # without a `to` column that could not be bound to the others.
  if (nrow(subPost) > 0) {

    # Computationally intensive
    targetAuthorId <- sapply(subPost$postId, returnTargetAuthorId,
                             subPost$postId, subPost$postMessage,
                             subPost$authorId, subPost$threadId,
                             subPost$postOrdinalNumber,
                             USE.NAMES = FALSE)

    relationList[[i]] <- data.frame(from = subPost$authorId,
                                    to = unlist(targetAuthorId),
                                    postId = subPost$postId,
                                    postTitle = subPost$postTitle,
                                    postDatetime = subPost$postDatetime,
                                    postOrdinalNumber = subPost$postOrdinalNumber,
                                    postMessage = subPost$postMessage,
                                    authorId = subPost$authorId,
                                    threadId = subPost$threadId )
  }

  # Progress report
  cat(paste0("Remaining loops: ", length(threadIds) - i, "\n"))
  cat(paste0(i,"\n"))
  i <- i + 1
}

# Drop the slots of skipped threads, then bind everything in one call
relationList <- Filter(Negate(is.null), relationList)
relations <- if (length(relationList) > 0) {
  do.call(rbind, relationList)
} else {
  data.frame()
}

# Identify relations with missing aid: rows whose `to` still holds a
# "pid..." post id because the quoted post was not found in the same thread.
# For each one, print the unresolved id followed by the result of looking it
# up in the full post table.
#
# Iterating over the matching row indices (rather than 1:nrow(relations))
# keeps the loop from running on indices 1 and 0 when relations is empty.
unresolved <- which(grepl("pid", relations$to))

for (row in unresolved) {
    print(relations$to[row])
    print(post$authorId[post$postId==relations$to[row]])
}

# Cleaning: keep only relations whose target is a known author, i.e. `to` is
# not NA and appears in users$name.
#
# A logical mask is used instead of relations[-which(...), ]: when every
# target is valid, which() returns integer(0) and negative indexing with an
# empty vector selects zero rows, discarding the whole table.
keep <- !is.na(relations$to) & relations$to %in% users$name
relations <- relations[keep, , drop = FALSE]

# Build the directed reply graph: users supplies the vertices and relations
# the edges. igraph reads the first two columns of relations (from, to) as the
# edge endpoints and keeps the remaining columns as edge attributes.
# NOTE(review): graph.data.frame() is deprecated in recent igraph releases in
# favour of graph_from_data_frame() -- confirm the installed version.
g_pp_direct_reply <- graph.data.frame(relations, directed=TRUE, vertices=users)
# Save the graph with author names in clear text
save(g_pp_direct_reply, file="02_24_pp_direct_reply_graph.RData")

# Replace the authorName vertex attribute with its MD5 hash and save a second
# copy. Only authorName is changed here; all other vertex and edge attributes
# are saved as they are.
# NOTE(review): an unsalted MD5 of a user name can be reversed by hashing
# candidate names -- confirm this is sufficient for the intended use.
library(openssl)
V(g_pp_direct_reply)$authorName <- md5(V(g_pp_direct_reply)$authorName)
save(g_pp_direct_reply, file="02_24_pp_direct_reply_graph_anonymised.RData")



